PBJ 야구

데이터 전처리

# Load the KBO pitching data set from the working directory and print a
# compact summary of its columns.
df <- read.csv(file = "kbopitchingdata.csv")
# View(df)
str(object = df)
'data.frame':   323 obs. of  34 variables:
 $ id                 : int  1 2 3 4 5 6 7 8 9 10 ...
 $ year               : int  2021 2021 2021 2021 2021 2021 2021 2021 2021 2021 ...
 $ team               : chr  "LG Twins" "KT Wiz" "Doosan Bears" "Samsung Lions" ...
 $ average_age        : num  26.3 28.4 27.5 28.8 27.7 25.8 27.3 27 25.3 27.1 ...
 $ runs_per_game      : num  3.9 4.06 4.57 4.57 4.8 4.89 5.13 5.13 5.22 5.64 ...
 $ wins               : int  72 75 70 75 67 69 66 49 58 64 ...
 $ losses             : int  57 59 65 59 67 67 63 82 75 71 ...
 $ win_loss_percentage: num  0.558 0.56 0.519 0.56 0.5 0.507 0.512 0.374 0.436 0.474 ...
 $ ERA                : num  3.57 3.67 4.28 4.29 4.5 4.33 4.8 4.67 4.89 5.39 ...
 $ run_average_9      : num  3.96 4.17 4.66 4.7 4.95 5.02 5.2 5.29 5.33 5.75 ...
 $ games              : int  143 143 143 143 143 143 143 143 143 143 ...
 $ games_started      : int  143 143 143 143 143 143 143 143 143 143 ...
 $ games_finished     : int  143 141 141 141 140 142 143 142 142 142 ...
 $ complete_game      : int  0 2 2 2 3 1 0 1 1 1 ...
 $ shutouts           : int  18 6 10 14 10 7 5 6 6 12 ...
 $ saves              : int  32 33 27 46 33 30 25 21 36 36 ...
 $ innings_pitched    : num  1264 1255 1260 1250 1247 ...
 $ hits               : int  1117 1166 1288 1287 1256 1276 1283 1200 1274 1330 ...
 $ runs               : int  557 581 653 653 686 699 733 734 746 806 ...
 $ earned_runs        : int  501 512 599 596 624 604 676 648 684 756 ...
 $ home_runs          : int  79 85 104 129 122 100 147 114 133 132 ...
 $ walks              : int  542 486 586 526 585 566 623 669 616 653 ...
 $ intentional_walks  : int  17 18 16 13 14 27 27 22 5 19 ...
 $ strikeouts         : int  1062 1051 1037 1031 1046 893 1006 1006 946 1047 ...
 $ hit_batter         : int  97 42 73 51 77 80 78 101 104 86 ...
 $ balks              : int  5 1 7 3 8 4 9 11 9 5 ...
 $ wild_pitches       : int  43 56 51 56 74 58 40 56 58 102 ...
 $ batters_faced      : int  5416 5359 5596 5496 5575 5568 5661 5633 5658 5722 ...
 $ WHIP               : num  1.31 1.32 1.49 1.45 1.48 ...
 $ hits_9             : num  8 8.4 9.2 9.3 9.1 9.2 9.1 8.6 9.1 9.5 ...
 $ homeruns_9         : num  0.6 0.6 0.7 0.9 0.9 0.7 1 0.8 1 0.9 ...
 $ walks_9            : num  3.9 3.5 4.2 3.8 4.2 4.1 4.4 4.8 4.4 4.7 ...
 $ strikeouts_9       : num  7.6 7.5 7.4 7.4 7.5 6.4 7.1 7.2 6.8 7.5 ...
 $ strikeout_walk     : num  1.96 2.16 1.77 1.96 1.79 1.58 1.61 1.5 1.54 1.6 ...
# Drop the columns that are not needed because they contain missing values;
# every other column of `df` is kept in its original order.
df_01 <- df[setdiff(
  names(df),
  c("games_started", "games_finished", "intentional_walks", "balks",
    "wild_pitches")
)]

# Rename teams whose name changed over the years so that every season of a
# franchise is grouped under a single label in `df_01$team`.
#
# `team_renames` maps each historical name (as spelled in the data) to its
# replacement. The lookup is vectorised: unlike an element-wise `if` loop over
# `1:length(...)`, it is a no-op on an empty data frame and leaves NA team
# values untouched instead of raising an error.
team_renames <- c(
  "MBC Blue Dragons" = "LG Twins",
  "OB Bears"         = "Doosan Bears",
  "Nexen Heroes"     = "Kiwoom Heroes",
  "Woori Heroes"     = "Kiwoom Heroes",
  "SK Wyverns"       = "SSG Landers",
  "Binggre Eagles"   = "Hanwha Eagles",
  "Haitai Tigers"    = "Kia Tigers",
  "Pacific Dolphins" = "Hyundai Unicorns",
  "Chungbo Pintos"   = "Hyundai Unicorns",
  "Sammi Superstars" = "Hyundai Unicorns"
)

# Only rows whose current value is one of the historical names are rewritten;
# all other rows keep their value.
has_old_name <- df_01$team %in% names(team_renames)
df_01$team[has_old_name] <- unname(team_renames[df_01$team[has_old_name]])

KBO 리그 팀별 평균 연령 : Horizontal Violin Graph

https://r-graph-gallery.com/violin_horizontal_ggplot2.html

# Libraries
library(ggplot2)
library(dplyr)
library(tidyr)
library(forcats)
library(hrbrthemes)
library(viridis)

# library(extrafont)
# font_import(paths=NULL, recursive = TRUE, prompt=TRUE, pattern=NULL) # "윈도우즈 폰트데이터베이스에서 찾을 수 없는 폰트페밀리입니다" 오류 해결, 30분 이상 소요

# Plot

# Horizontal violin plot: one violin per team showing the distribution of
# `average_age` across that team's rows in `df_01`. Fill and outline colours
# both come from the discrete viridis palette, keyed by team.
p <- ggplot(
  data = df_01,
  mapping = aes(x = team, y = average_age, fill = team, color = team)
) +
  # `linewidth` sets the outline width; the `size` argument used for lines was
  # deprecated in ggplot2 3.4.0.
  geom_violin(width = 1.6, linewidth = 0.5) +
  scale_fill_viridis(discrete = TRUE) +
  scale_color_viridis(discrete = TRUE) +
  theme_ipsum() +
  # Team names are already on the axis, so the legend is hidden.
  theme(legend.position = "none") +
  # Swap the x and y axes to get the horizontal version of the plot.
  coord_flip() +
  labs(x = "", y = "age")
p

https://plotly.com/r/violin/

library(plotly)

# Interactive violin plot of `average_age` per team. `split = ~team` draws a
# separate violin trace for each team; each violin also shows an embedded box
# plot and a mean line.
# TRUE/FALSE are spelled out because T and F are ordinary variables that can
# be reassigned.
fig <- df_01 |>
  plot_ly(
    x = ~team,
    y = ~average_age,
    split = ~team,
    type = "violin",
    box = list(visible = TRUE),
    meanline = list(visible = TRUE)
  ) |>
  layout(
    xaxis = list(title = "KBO TEAM"),
    yaxis = list(title = "AGE", zeroline = FALSE)
  )

fig

KBO 리그 승률, 승 : Stacked Funnel Plot

https://plotly.com/r/funnel-charts/

# library(showtext) # 다운로드 없이 구글 제공 폰트 사용
# font_add_google("Gochi Hand", "gochi")

# Need to install plotly from Github to get funnel plots
# devtools::install_github("ropensci/plotly")

library(plotly)
# Keep only the 2021 season and sort the teams by wins, highest first.
# `df_2021` is left in this sorted order for later chunks that reuse it.
df_2021 <- df_01 |>
  subset(year == 2021) |>
  arrange(desc(wins))

# Stacked funnel chart with one stage per team and two traces:
#   - win_loss_percentage, scaled from a fraction to a percentage
#   - wins
# Each segment is labelled with its value and its share of the trace total.
fig <- plot_ly(
  type = "funnel",
  name = "win_loss_percentage",
  y = df_2021$team,
  x = df_2021$win_loss_percentage * 100,
  textposition = "inside",
  textinfo = "value+percent total"
) |>
  add_trace(
    type = "funnel",
    name = "wins",
    y = df_2021$team,
    x = df_2021$wins,
    textposition = "inside",
    textinfo = "value+percent total"
  ) |>
  layout(
    # Order the stages by the sorted team names. The category array has to
    # contain the actual y labels; a numeric placeholder such as 1:6 matches
    # none of them.
    yaxis = list(
      categoryorder = "array",
      categoryarray = df_2021$team
    )
  )

fig

KBO 선수들의 실책 : 3D Bubble Plot

https://plotly.com/r/3d-scatter-plots/

library(plotly)

# 3D bubble chart of the 2021 rows: runs (x), hits (y) and walks (z), one
# colour per team, with the marker diameter scaled by wins into the 25-50
# range. The hover text repeats the four plotted values.
fig <- plot_ly(
  df_2021,
  x = ~runs, y = ~hits, z = ~walks,
  color = ~team, size = ~wins,
  marker = list(symbol = "circle", sizemode = "diameter"),
  sizes = c(25, 50),
  text = ~paste(
    "runs : ", runs, "<br>hits : ", hits,
    "<br> walks :", walks, "<br> wins : ", wins
  )
)

fig <- fig |>
  layout(
    title = "선수들의 실책",
    scene = list(
      # The x and z axes use a log scale; the y axis keeps the default type.
      xaxis = list(
        title = "runs",
        gridcolor = "rgb(255, 255, 255)",
        type = "log",
        zerolinewidth = 1,
        ticklen = 5,
        gridwidth = 2
      ),
      yaxis = list(
        title = "hits",
        gridcolor = "rgb(255, 255, 255)",
        zerolinewidth = 1,
        ticklen = 5,
        # Was misspelled `gridwith`, which is not an axis attribute.
        gridwidth = 2
      ),
      zaxis = list(
        title = "walks",
        gridcolor = "rgb(255, 255, 255)",
        type = "log",
        zerolinewidth = 1,
        ticklen = 5,
        # Was misspelled `gridwith`, which is not an axis attribute.
        gridwidth = 2
      )
    ),
    paper_bgcolor = "rgb(243, 243, 243)",
    plot_bgcolor = "rgb(243, 243, 243)"
  )

fig